Commit 5748e86 (parent a9658c7)

Introduce LocalScore CLI


46 files changed: +4696 / -1 lines (large commit; only a subset of the changed files is shown below)

Makefile

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,7 @@ include llamafile/BUILD.mk
 include llama.cpp/BUILD.mk
 include stable-diffusion.cpp/BUILD.mk
 include whisper.cpp/BUILD.mk
+include localscore/BUILD.mk

 # the root package is `o//` by default
 # building a package also builds its sub-packages
@@ -24,6 +25,7 @@ o/$(MODE)/: o/$(MODE)/llamafile \
         o/$(MODE)/llama.cpp \
         o/$(MODE)/stable-diffusion.cpp \
         o/$(MODE)/whisper.cpp \
+        o/$(MODE)/localscore \
         o/$(MODE)/third_party \
         o/$(MODE)/depend.test

@@ -43,6 +45,7 @@ install: llamafile/zipalign.1 \
         o/$(MODE)/llama.cpp/imatrix/imatrix \
         o/$(MODE)/llama.cpp/quantize/quantize \
         o/$(MODE)/llama.cpp/llama-bench/llama-bench \
+        o/$(MODE)/localscore/localscore \
         o/$(MODE)/llama.cpp/perplexity/perplexity \
         o/$(MODE)/llama.cpp/llava/llava-quantize \
         o/$(MODE)/stable-diffusion.cpp/main \
@@ -55,6 +58,7 @@ install: llamafile/zipalign.1 \
         $(INSTALL) o/$(MODE)/llama.cpp/imatrix/imatrix $(PREFIX)/bin/llamafile-imatrix
         $(INSTALL) o/$(MODE)/llama.cpp/quantize/quantize $(PREFIX)/bin/llamafile-quantize
         $(INSTALL) o/$(MODE)/llama.cpp/llama-bench/llama-bench $(PREFIX)/bin/llamafile-bench
+        $(INSTALL) o/$(MODE)/localscore/localscore $(PREFIX)/bin/localscore
         $(INSTALL) build/llamafile-convert $(PREFIX)/bin/llamafile-convert
         $(INSTALL) build/llamafile-upgrade-engine $(PREFIX)/bin/llamafile-upgrade-engine
         $(INSTALL) o/$(MODE)/llama.cpp/perplexity/perplexity $(PREFIX)/bin/llamafile-perplexity

RELEASE.md

Lines changed: 129 additions & 0 deletions
# Making a Llamafile Release

Making a Llamafile release involves a few steps, which are detailed in this document.

The two primary artifacts of the release are the `llamafile-<version>.zip` and the binaries for the GitHub release.

## Release Process

Note: Steps 2 and 3 are only needed if you are making a new release of the ggml-cuda.so and ggml-rocm.so shared libraries. You only need to do this when you are changing the CUDA code or the APIs surrounding it; otherwise you can reuse the shared libraries from the previous release.

1. Update the version number in `version.h`.
2. Build the ggml-cuda.so and ggml-rocm.so shared libraries on Linux. You need to do this for both Llamafile and LocalScore: for CUDA, Llamafile uses TINYBLAS by default while LocalScore uses CUBLAS by default.
   - For Llamafile, run the scripts `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively.
   - For LocalScore, run the script `./localscore/cuda.sh`.
   - The files will be built and placed in your home directory.
3. Build the ggml-cuda.dll and ggml-rocm.dll shared libraries on Windows. You need to do this for both Llamafile and LocalScore.
   - For Llamafile, run the scripts `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively.
   - For LocalScore, run the script `./localscore/cuda.bat`.
   - The files will be built and placed in the `build/release` directory.
4. Build the project with `make -j8`.
5. Install the built project to your /usr/local/bin directory with `sudo make install PREFIX=/usr/local`.

### Llamafile Release Zip

The easiest way to create the release zip is to run:

`make install PREFIX=<preferred_dir>/llamafile-<version>`

After the directory is created, you will want to bundle the built shared libraries into the following release binaries:

- `llamafile`
- `localscore`
- `whisperfile`

You can do this for each binary with a command like the following.

Note: you MUST put the shared libraries in the same directory as the binary you are packaging.

For llamafile and whisperfile you can do the following:

`zipalign -j0 llamafile ggml-cuda.so ggml-rocm.so ggml-cuda.dll ggml-rocm.dll`
`zipalign -j0 whisperfile ggml-cuda.so ggml-rocm.so ggml-cuda.dll ggml-rocm.dll`

After doing this, delete the ggml-cuda.so and ggml-cuda.dll files from the directory, then copy and rename the ggml-cuda.localscore.so and ggml-cuda.localscore.dll files into it:

```
rm <path_to>/llamafile-<version>/bin/ggml-cuda.so <path_to>/llamafile-<version>/bin/ggml-cuda.dll
cp ~/ggml-cuda.localscore.so <path_to>/llamafile-<version>/bin/ggml-cuda.so
cp ~/ggml-cuda.localscore.dll <path_to>/llamafile-<version>/bin/ggml-cuda.dll
```

For localscore you can now package it:

`zipalign -j0 localscore ggml-cuda.so ggml-rocm.so ggml-cuda.dll ggml-rocm.dll`

After you have done this for all the binaries, copy the existing PDFs (from the prior release) into the directory:

`cp <path_to>/doc/*.pdf <path_to>/llamafile-<version>/share/doc/llamafile/`

The zip is structured as follows:

```
llamafile-<version>
|-- README.md
|-- bin
|   |-- llamafile
|   |-- llamafile-bench
|   |-- llamafile-convert
|   |-- llamafile-imatrix
|   |-- llamafile-perplexity
|   |-- llamafile-quantize
|   |-- llamafile-tokenize
|   |-- llamafile-upgrade-engine
|   |-- llamafiler
|   |-- llava-quantize
|   |-- localscore
|   |-- sdfile
|   |-- whisperfile
|   `-- zipalign
`-- share
    |-- doc
    |   `-- llamafile
    |       |-- llamafile-imatrix.pdf
    |       |-- llamafile-perplexity.pdf
    |       |-- llamafile-quantize.pdf
    |       |-- llamafile.pdf
    |       |-- llamafiler.pdf
    |       |-- llava-quantize.pdf
    |       |-- whisperfile.pdf
    |       `-- zipalign.pdf
    `-- man
        `-- man1
            |-- llamafile-imatrix.1
            |-- llamafile-perplexity.1
            |-- llamafile-quantize.1
            |-- llamafile.1
            |-- llamafiler.1
            |-- llava-quantize.1
            |-- whisperfile.1
            `-- zipalign.1
```

Before you zip the directory, remove the shared libraries from it:

`rm *.so *.dll`

You can then zip the directory with the following command:

`zip -r llamafile-<version>.zip llamafile-<version>`

### Llamafile Release Binaries

After you have built the zip, it is quite easy to create the release binaries.

The following binaries are part of the release:

- `llamafile`
- `llamafile-bench`
- `llamafiler`
- `sdfile`
- `localscore`
- `whisperfile`
- `zipalign`

You can use the release script to create the appropriately named binaries:

`./llamafile/release.sh -v <version> -s <source_dir> -d <dest_dir>`

Make sure to move the llamafile-<version>.zip file to the <dest_dir> as well; once you have tested everything, you are good to release.

llama.cpp/BUILD.mk

Lines changed: 1 addition & 1 deletion
@@ -93,4 +93,4 @@ o/$(MODE)/llama.cpp: \
         o/$(MODE)/llama.cpp/imatrix \
         o/$(MODE)/llama.cpp/quantize \
         o/$(MODE)/llama.cpp/perplexity \
-        o/$(MODE)/llama.cpp/llama-bench
+        o/$(MODE)/llama.cpp/llama-bench \

llama.cpp/ggml-cuda.cu

Lines changed: 12 additions & 0 deletions
@@ -17266,6 +17266,18 @@ GGML_CALL int ggml_backend_cuda_get_device_count() {
     return ggml_cuda_info().device_count;
 }

+GGML_CALL void ggml_backend_cuda_get_device_properties(int device, struct ggml_cuda_device_properties * properties) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+
+    strncpy(properties->name, prop.name, sizeof(properties->name) - 1);
+    properties->totalGlobalMem = prop.totalGlobalMem;
+    properties->multiProcessorCount = prop.multiProcessorCount;
+    properties->major = prop.major;
+    properties->minor = prop.minor;
+    snprintf(properties->compute, sizeof(properties->compute), "%d.%d", prop.major, prop.minor);
+}
+
 GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));

llama.cpp/ggml-cuda.h

Lines changed: 10 additions & 0 deletions
@@ -20,6 +20,15 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

+struct ggml_cuda_device_properties {
+    char name[256];
+    size_t totalGlobalMem;
+    int multiProcessorCount;
+    int major;
+    int minor;
+    char compute[8];
+};
+
 GGML_API GGML_CALL bool ggml_cuda_link(const struct ggml_backend_api * backend_api);

 // backend API
@@ -37,6 +46,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

 GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_properties(int device, struct ggml_cuda_device_properties * properties);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
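As a quick illustration of the new API (not part of this commit), a caller could enumerate CUDA devices and read the new properties struct roughly as in the sketch below. The repo-relative include path and the standalone `main` are assumptions for the example; only the declarations shown in the diff above are relied on.

```c
// Hedged sketch: list CUDA devices via the new properties accessor.
#include <stdio.h>
#include "llama.cpp/ggml-cuda.h"  // assumed include path, mirroring main.cpp

int main(void) {
    int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; i++) {
        struct ggml_cuda_device_properties props;
        ggml_backend_cuda_get_device_properties(i, &props);
        // name, VRAM, SM count, and the "major.minor" compute capability string
        printf("device %d: %s, %zu bytes VRAM, %d SMs, compute %s\n",
               i, props.name, props.totalGlobalMem,
               props.multiProcessorCount, props.compute);
    }
    return 0;
}
```

This is presumably the kind of per-GPU information LocalScore reports alongside its benchmark results.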

llama.cpp/ggml-metal.h

Lines changed: 13 additions & 0 deletions
@@ -35,6 +35,15 @@ struct ggml_cgraph;
 extern "C" {
 #endif

+struct ggml_metal_device_properties {
+    char name[256];
+    float memory;
+    int core_count;
+    int metal_version;
+    int gpu_family;
+    int gpu_family_common;
+};
+
 void ggml_metal_link(const struct ggml_backend_api *);

 //
@@ -56,6 +65,10 @@ GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml

 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

+GGML_API void ggml_backend_metal_get_device_properties(ggml_backend_t backend, struct ggml_metal_device_properties * properties);
+
+GGML_API void ggml_backend_metal_get_device_memory_usage(ggml_backend_t backend, float * used, float * total);
+
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
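For symmetry with the CUDA change, here is a hedged caller-side sketch (not part of this commit) of the two new Metal helpers. It assumes `ggml_backend_metal_init()` and `ggml_backend_free()` are available from the vendored ggml headers, and it follows the units implied by the implementation further down: GiB for `memory`, MiB for the usage helper.

```c
// Hedged sketch: query the new Metal device-properties and memory-usage APIs.
#include <stdio.h>
#include "llama.cpp/ggml-metal.h"  // assumed include path, mirroring main.cpp

int main(void) {
    ggml_backend_t backend = ggml_backend_metal_init();  // assumed available
    if (!backend) {
        fprintf(stderr, "no Metal backend available\n");
        return 1;
    }

    struct ggml_metal_device_properties props;
    ggml_backend_metal_get_device_properties(backend, &props);

    float used_mib = 0.0f, total_mib = 0.0f;
    ggml_backend_metal_get_device_memory_usage(backend, &used_mib, &total_mib);

    printf("%s: %.1f GiB, family Apple%d / Common%d, Metal %d\n",
           props.name, props.memory, props.gpu_family,
           props.gpu_family_common, props.metal_version);
    printf("working set: %.1f / %.1f MiB\n", used_mib, total_mib);

    ggml_backend_free(backend);  // assumed available via ggml-backend.h
    return 0;
}
```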

llama.cpp/ggml-metal.m

Lines changed: 28 additions & 0 deletions
@@ -304,6 +304,10 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
     id<MTLDevice> device;
     id<MTLCommandQueue> queue;

+    int family;
+    int family_common;
+    int metal_version;
+
     dispatch_queue_t d_queue;

     struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
@@ -507,20 +511,23 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
         if ([ctx->device supportsFamily:i]) {
             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
+            ctx->family = i - MTLGPUFamilyApple1 + 1;
             break;
         }
     }

     for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
         if ([ctx->device supportsFamily:i]) {
             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
+            ctx->family_common = i - MTLGPUFamilyCommon1 + 1;
             break;
         }
     }

     for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
         if ([ctx->device supportsFamily:i]) {
             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
+            ctx->metal_version = i - MTLGPUFamilyMetal3 + 3;
             break;
         }
     }
@@ -3367,6 +3374,27 @@ void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_ca
     ctx->abort_callback_data = user_data;
 }

+void ggml_backend_metal_get_device_properties(ggml_backend_t backend, struct ggml_metal_device_properties * properties) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+    strncpy(properties->name, [ctx->device name].UTF8String, sizeof(ctx->device.name));
+    properties->memory = ctx->device.recommendedMaxWorkingSetSize / 1073741824.0 ; // TODO is this what i want? mb? 1024*1024*1024 is bytes to gb
+    properties->gpu_family = ctx->family;
+    properties->gpu_family_common = ctx->family_common;
+    properties->metal_version = ctx->metal_version;
+}
+
+void ggml_backend_metal_get_device_memory_usage(ggml_backend_t backend, float * used, float * total) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+    *used = (float)ctx->device.currentAllocatedSize / 1024.0 / 1024.0;
+    *total = (float)ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0;
+}
+
 bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
     GGML_ASSERT(ggml_backend_is_metal(backend));

llama.cpp/llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -17565,6 +17565,10 @@ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
     return ctx->cparams.pooling_type;
 }

+int32_t llama_model_quant_str(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s", llama_model_ftype_name(model->ftype).c_str());
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->hparams.n_vocab;
 }

llama.cpp/llama.h

Lines changed: 3 additions & 0 deletions
@@ -478,6 +478,9 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Get the quantization of the model
+    LLAMA_API int32_t llama_model_quant_str(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
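A small usage sketch (not part of this commit) for the new `llama_model_quant_str()`: it prints the quantization name of a GGUF model. The surrounding loader calls (`llama_backend_init`, `llama_model_default_params`, `llama_load_model_from_file`, `llama_free_model`, `llama_backend_free`) and the `vocab_only` shortcut are assumptions based on the vendored `llama.h` and may differ slightly.

```c
// Hedged sketch: report a model's quantization via llama_model_quant_str().
#include <stdio.h>
#include "llama.cpp/llama.h"  // assumed include path, mirroring main.cpp

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL.gguf\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true;  // metadata is enough for the ftype; skip the weights
    struct llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (!model) {
        fprintf(stderr, "failed to load %s\n", argv[1]);
        return 1;
    }

    char quant[64];
    llama_model_quant_str(model, quant, sizeof(quant));
    printf("quantization: %s\n", quant);  // e.g. a Q4_K-style ftype name

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```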

llama.cpp/main/BUILD.mk

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,8 @@ o/$(MODE)/llama.cpp/main/main: \
         o/$(MODE)/llama.cpp/main/embedding.o \
         o/$(MODE)/llamafile/server/server.a \
         o/$(MODE)/llama.cpp/server/server.a \
+        o/$(MODE)/localscore/localscore.a \
+        o/$(MODE)/third_party/mbedtls/mbedtls.a \
         o/$(MODE)/llama.cpp/llava/llava.a \
         o/$(MODE)/llama.cpp/llama.cpp.a \
         o/$(MODE)/llamafile/highlight/highlight.a \
@@ -21,6 +23,7 @@ o/$(MODE)/llama.cpp/main/main: \
         o/$(MODE)/llamafile/server/main.1.asc.zip.o \
         $(LLAMA_CPP_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \
         $(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \
+        $(THIRD_PARTY_MBEDTLS_A_CERTS:%=o/$(MODE)/%.zip.o) \

 $(LLAMA_CPP_MAIN_OBJS): llama.cpp/main/BUILD.mk

llama.cpp/main/main.cpp

Lines changed: 8 additions & 0 deletions
@@ -27,6 +27,7 @@
 #include "llama.cpp/ggml-metal.h"
 #include "llama.cpp/llava/llava.h"
 #include "llama.cpp/server/server.h"
+#include "localscore/localscore.h"
 #include "llamafile/server/prog.h"

 static llama_context ** g_ctx;
@@ -152,6 +153,7 @@ enum Program {
     CHATBOT,
     EMBEDDING,
     LLAMAFILER,
+    LOCALSCORE
 };

 enum Program determine_program(char *argv[]) {
@@ -168,6 +170,8 @@ enum Program determine_program(char *argv[]) {
             prog = EMBEDDING;
         } else if (!strcmp(argv[i], "--v2")) {
             v2 = true;
+        } else if (!strcmp(argv[i], "--localscore")) {
+            prog = LOCALSCORE;
         }
     }
     if (prog == SERVER && v2) {
@@ -221,6 +225,10 @@ int main(int argc, char ** argv) {
         return embedding_cli(argc, argv);
     }

+    if (prog == LOCALSCORE) {
+        return localscore_cli(argc, argv);
+    }
+
     launch_sigint_thread();

     gpt_params params;
